source('../../workflow/resources/annotateVariants.R')
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
sampleName <- 'Br11'
inputFolder <- '/cluster/work/bewi/members/jgawron/projects/CTC/input_folder'
For each cluster (defined by color), we computed a pairwise distance for each mutation pair that indicates how often the two mutations occur in the same private branch of cells from the cluster:
dist(M1, M2) = 0 (for M1 = M2)
dist(M1,M2) = 1 - (%samples where M1 and M2 are both in the same private branch of a cell from the cluster) (elsewise)
A private branch is defined as the path from a leaf to the node just below the LCA of this leaf to another leaf from the same cluster.
This is a generalization of the earlier method to find the top seperating mutations of pairs of leafs. The generalization was necessary to handle the larger clusters that were broken in more than 2 pieces.
clusterName <- 'lightcoral'
d <- read.table(file.path(inputFolder, sampleName, paste0(sampleName, '_postSampling_',clusterName,'.txt') ),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1)
mat<-as.matrix(d)
mat[1:4, 1:4]
## chr6_32293617 chr5_139276028 chr7_65399410 chr3_121257206
## chr6_32293617 0.000000 0.964925 0.999925 0.954200
## chr5_139276028 0.964925 0.000000 1.000000 0.982775
## chr7_65399410 0.999925 1.000000 0.000000 1.000000
## chr3_121257206 0.954200 0.982775 1.000000 0.000000
For each position, we computed the percentage of samples that have a coverage of at least 3 at this position. This is meant as a simple score of the data quality of a position that can be used in addition to the separation score to pick mutations for the wet lab experiments. Furthermore, we added simple functional annotations to the variants.
coverage<-read.table(file.path(inputFolder, sampleName, paste(sampleName, 'covScore.txt', sep = '_')),header=TRUE,sep="\t", stringsAsFactors=F, row.names=1)
coverage$variantName <- rownames(coverage)
head(coverage)
## covScore variantName
## chr6_32293617 0.7222222 chr6_32293617
## chr5_139276028 0.5000000 chr5_139276028
## chr7_65399410 0.6666667 chr7_65399410
## chr3_121257206 0.5555556 chr3_121257206
## chrX_290695 0.2777778 chrX_290695
## chr1_109573768 0.3888889 chr1_109573768
annotations <- annotate_variants(sampleName, inputFolder)
coverage <- inner_join(coverage, annotations, by = "variantName")
###Overview To get an overview, we plot the full distance matrix:
library(heatmaply)
## Loading required package: plotly
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## Loading required package: viridis
## Loading required package: viridisLite
##
## ======================
## Welcome to heatmaply version 1.5.0
##
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
##
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## You may ask questions at stackoverflow, use the r and heatmaply tags:
## https://stackoverflow.com/questions/tagged/heatmaply
## ======================
heatmaply(mat)
mat2 <- mat
diag(mat2) <- 1
min_dist <- apply(mat2, 1, min) # find minimum distance to other mutations
selected_muts <- which(min_dist<0.9) # select those below 0.5 say
mat2 <- mat[selected_muts, selected_muts]
This is what the distance matrix looks like now:
heatmaply(mat2)
coverage %>% filter(variantName %in% colnames(mat2))
## covScore variantName REF ALT relevant
## 1 0.7222222 chr6_32293617 T A MODERATE
## 2 0.3888889 chr1_109573768 G A NONE
## 3 0.5555556 chr6_83656797 G C MODERATE
## 4 0.3888889 chr4_133162921 A C NONE
## 5 0.6111111 chrX_141698461 T G NONE
## 6 0.3888889 chr16_12002913 G A NONE
## 7 0.5000000 chr17_12133785 G A NONE
## 8 0.3888889 chr2_218644549 A C MODERATE
## 9 0.5000000 chr18_7002377 G A NONE
## 10 0.7777778 chr3_100825799 G A MODERATE
## 11 0.2222222 chr10_44931932 A T NONE
## 12 0.5555556 chr3_121490363 A G NONE
## 13 0.6666667 chr14_19499672 G A NONE
## 14 0.7222222 chr14_19499787 T G NONE
## 15 0.5555556 chr6_111372656 G C MODERATE
## 16 0.4444444 chr6_112087431 T G MODERATE
## 17 0.2222222 chr3_10358774 G A MODERATE
## 18 0.3888889 chr4_146503823 T G NONE
## 19 0.8333333 chr3_75669309 C G NONE
## 20 0.6666667 chr6_30951754 T C MODERATE
## 21 0.4444444 chr6_157174912 A G NONE
## 22 0.2777778 chr6_35080094 C A NONE
## 23 0.3333333 chr20_2462660 G A MODERATE
## 24 0.2777778 chr22_43928750 A G NONE
To cluster mutations, we create a dendrogram based on the pairwise distances:
d_mat <- as.dist(mat)
hc <- hclust(d_mat, "average") ## hierarchical clustering of mutations based on distance matrix
par(cex=0.6)
plot(hc, main = "Dendrogram based on average pairwise distance", sub = "", xlab = "Separating mutations")
No apparent clustering visible.